/*******************************************************************************
  ARTICLE	GAY, GOBBI, GONI (2025) "REVOLUTIONARY TRANSITIONS. INHERITANCE    
            CHANGE AND FERTILITY DECLINE" JOURNAL OF POLITICAL ECONOMY         
                                                                               
  AUTHORS	VICTOR GAY, PAULA GOBBI, MARC GONI                                 
  CONTACT	victor.gay@tse-fr.eu; paula.eugenia.gobbi@ulb.be; marc.goni@uib.no 
  VERSION	1.0 (MAY 2025)                                                     
  SOFTWARE	STATA SE 18                                                        
  LICENCE	MIT                                                                
--------------------------------------------------------------------------------

AGRICULTURAL CENSUS DATA PREPARATION DO FILE

This file prepares agricultural census data and generates the final agricultural census dataset for analysis.

Instructions: 
-------------
	open do-files from directory where they are placed; order matters; run whole code.

Do-file structure: 
------------------
	1. 1852 ARRONDISSEMENTS STRUCTURE
	2. 1852 AGRICULTURAL CENSUS DATA
	3. SOIL TEXTURE AND RUGGEDNESS

Main sources: 
-------------
	1852 agricultural census (Marin and Marraud 2011)
	
Other sources:
--------------
	1852 arrondissement shapefiles (Gay, 2020)
	Waterways (SANDRE, 2017)
	Ruggedness (Nunn and Puga, 2012)
	Soil texture (INRA, 1998)
		 
*/
********************************************************************************

* Session setup: pin the Stata version, disable output paging, and start from
* an empty session (clear all also resets timers, so timer 1 is started after it).
version 18
set more off
clear all
* Folder for intermediate .dta files; the path is relative to the current
* working directory, so the do-file must be run from its own folder.
global TEMP "../2_0_tempfiles"

* Start timer 1; it is stopped and listed at the end of the file.
timer on 1

* ==============================================================================
* 1. 1852 ARRONDISSEMENTS STRUCTURE
* ------------------------------------------------------------------------------

* Master frame: list of 1852 arrondissements (first row holds variable names).
import excel using "../../1_raw_data/1_21_agricensus/arrondissements.xlsx",  ///
	sheet("Sheet1") firstrow clear

/* coast and major-river indicators, built from the arrondissement-river table */
preserve
	import dbase using "../../1_raw_data/1_21_agricensus/ARRONDISSEMENTS_1852/ARRONDISSEMENTS_1852_RIVERS.dbf", clear case(lower)

	* flag rows whose waterway name is one of the five rivers of interest
	* NOTE(review): "la Loire" is lower-case while the others are capitalised;
	* presumably this mirrors the spelling in the source table -- confirm.
	generate river = inlist(toponyme, "la Loire", "Le Rhône", "La Garonne",  ///
		"Le Rhin", "La Seine")

	* one row per arrondissement: after removing duplicates an arrondissement
	* has at most one river = 0 row and one river = 1 row, so the summed coast
	* flag can reach 2 and is reset to 1 (assumes coast is a 0/1 flag that is
	* constant within arrondissement -- TODO confirm)
	keep dep-ar_name coast river
	duplicates drop
	collapse (sum) coast river, by(dep-ar_name)
	replace coast = 1 if coast == 2

	* keep the identifier as a zero-padded 3-character string, in first position
	keep depar coast river
	generate depar_str = string(depar, "%03.0f")
	drop depar
	rename depar_str depar
	order depar, first

	save "$TEMP/arrondissements_river", replace
restore

* attach the indicators; every row of the river table must match the frame
merge 1:1 depar using "$TEMP/arrondissements_river", assert(1 3) nogenerate
erase "$TEMP/arrondissements_river.dta"

save "$TEMP/arrondissements_1852", replace
* ==============================================================================

* ==============================================================================
* 2. 1852 AGRICULTURAL CENSUS DATA
* ------------------------------------------------------------------------------

* NUMBER OF FARMS

* Read the rural-economy table; columns are imported under their Excel letters.
import excel using "../../1_raw_data/1_21_agricensus/economie_rurale.xls",   ///
	sheet("Feuil1") cellrange(A2:CY452) clear

/* keep the identifiers (A, C, D) and the owner/tenant counts (E-L) only */
drop B M-CY

rename (A C D) (depar_census ar_name_census dep_name_census)
rename (E F G H I J K L)                                                     ///
	(proprio_1 proprio_2 proprio_3 proprio_4 fermiers metayers fermes_1 fermes_2)

label variable depar_census "Département-arrondissement census identifier [Census 1852]"
label variable ar_name_census "Arrondissement census name [Census 1852]"
label variable dep_name_census "Département census name [Census 1852]"
label variable proprio_1 "Nombre de propriétaires ayant des propriétés sur le territoire sans y demeurer"
label variable proprio_2 "Nombre de propriétaires demeurant sur le territoire sans cultiver eux-mêmes"
label variable proprio_3 "Nombre de propriétaires ne cultivant que pour eux-mêmes"
label variable proprio_4 "Nombre de propriétaires cultivant pour eux-mêmes et pour autrui (journaliers)"
label variable fermiers "Nombre des fermiers (payant un fermage fixe en argent)"
label variable metayers "Nombre des métayers ou colons, etc. (donnant au propriétaire une part des produits)"
label variable fermes_1 "Nombre des fermes cultivées par un maître valet (pour le compte du propriétaire)"
label variable fermes_2 "Nombre des fermes cultivées par un régisseur"

/* discard rows lacking either identifier */
drop if missing(depar_census) | missing(dep_name_census)

/* the source codes Paris as -1; recode those cells to 0 */
foreach tenure of varlist proprio_1-fermes_2 {
	replace `tenure' = 0 if `tenure' == -1
}

save "$TEMP/farms", replace

* LAND SIZE

* Read the crops table; columns are imported under their Excel letters.
import excel using "../../1_raw_data/1_21_agricensus/cultures_diverses.xls", ///
	sheet("Feuil1") cellrange(A2:GB452) clear

/* keep the identifiers (A, C, D) and the arable-land columns (FP-FU) only */
drop B E-FO FV-GB

rename (A C D) (depar_census ar_name_census dep_name_census)
rename (FP FQ FR FS FT FU)                                                   ///
	(terres_1 terres_2 terres_3 terres_4 terres_5 terres_total)

label variable depar_census "Département-arrondissement census identifier [Census 1852]"
label variable ar_name_census "Arrondissement census name [Census 1852]"
label variable dep_name_census "Département census name [Census 1852]"
label variable terres_1 "Terres labourables, céréales"
label variable terres_2 "Terres labourables, racines et légumes"
label variable terres_3 "Terres labourables, cultures diverses"
label variable terres_4 "Terres labourables, prairies artificielles"
label variable terres_5 "Terres labourables, jachère"
label variable terres_total "Total des terres labourables"

/* discard rows lacking either identifier */
drop if missing(depar_census) | missing(dep_name_census)

save "$TEMP/land_size", replace

** MERGE DATA

* Combine farm counts and land areas on the three census identifiers;
* assert(3) stops the do-file unless every row matches on both sides.
use "$TEMP/farms", clear
merge 1:1 depar_census ar_name_census dep_name_census using "$TEMP/land_size", ///
	assert(3) nogenerate

* Attach the 1852 arrondissement frame (which must therefore carry the same
* three census identifiers), again requiring a full match.
merge 1:1 depar_census dep_name_census ar_name_census using                  ///
	"$TEMP/arrondissements_1852", assert(3) nogenerate
* Put the frame identifiers and the coast/river flags at the front.
order dep_name depar ar_name, first
order coast river, after(ar_name)
* The two intermediate files are no longer needed.
erase "$TEMP/farms.dta"
erase "$TEMP/land_size.dta"

** AVERAGE FARM SIZE

/* land size measures */
* Measure 1: total arable land as reported by the census.
generate land_size_1 = terres_total
label variable land_size_1 "Land size (terres labourables, hectares)"

* Measure 2: cereals + roots/vegetables + other crops, i.e. excluding
* artificial meadows (terres_4) and fallow (terres_5).
generate land_size_2 = terres_1 + terres_2 + terres_3
label variable land_size_2 "Land size (cultivées, hectares)"

/* number of farms measures */
* Sum of the first three owner categories only; proprio_4, fermiers, metayers
* and fermes_* do not enter the count.
generate farms = proprio_1 + proprio_2 + proprio_3
label variable farms "Farms (owners)"

/* average farm size measures */
* Division by farms == 0 yields a missing value in Stata.
generate av_farm_size_1 = land_size_1 / farms
label variable av_farm_size_1 "Hectares (terres labourables) per farm (owners) [Census 1852]"

generate av_farm_size_2 = land_size_2 / farms
label variable av_farm_size_2 "Hectares (terres cultivées) per farm (owners) [Census 1852]"

* Positional range drop: removes every variable stored from proprio_1 through
* farms in dataset order. That covers the raw counts, the terres_* columns,
* land_size_1, land_size_2 and farms, and also any variable of the
* arrondissement frame that was not moved to the front above.
* NOTE(review): this depends on the order of the generate statements -- do not
* reorder them without revisiting this line.
drop proprio_1-farms

save "$TEMP/farm_size_arrond", replace
* ==============================================================================

* ==============================================================================
* 3. SOIL TEXTURE AND RUGGEDNESS
* ------------------------------------------------------------------------------

* SOIL TEXTURE

/* load the soil-texture polygons intersected with arrondissements */
import dbase using "../../1_raw_data/1_14_soil_texture/SOIL_TEXTURE/SOIL_TEXTURE_ARRONDISSEMENTS_INTERSECTED.dbf", clear case(lower)
* move texture to the end so it stays outside the dep-ar_name range used below
order texture, last

/* area weights: share of each polygon in its arrondissement */
bysort depar: egen area_depar = total(area)
generate area_sh = area / area_depar

/* sanity check: shares add up to one within each arrondissement */
preserve
	collapse (sum) area_sh, by(dep-ar_name)
	assert area_sh > 0.9999
restore

/* sandy area: polygon area when texture < 1.5, missing when texture is missing */
generate area_sandy = area * cond(texture == ., .a, texture < 1.5)

/* area-weighted mean texture and total sandy area per arrondissement */
* collapse (sum) skips missing values, so polygons without a texture value
* contribute nothing to either total
generate mean_texture = texture * area_sh
collapse (sum) mean_texture area_sandy, by(dep-ar_name area_depar)
rename mean_texture texture
* a total of zero is recoded to missing texture
replace texture = . if texture == 0

/* share of sandy soil, undefined where texture is missing */
generate sh_sandy = area_sandy / area_depar if texture != .
drop area*

/* texture rounded to the nearest integer category */
generate texture_cat = round(texture)

label variable texture "Soil texture (weighted average) [INRA 1998]"
label variable texture_cat "Soil texture (weighted average, rounded)  [INRA 1998]"

/* sandy indicator at the arrondissement level (same < 1.5 cut-off) */
generate sandy = cond(texture == ., .a, texture < 1.5)
label define sandy_lbl 0 "Not sandy" 1 "Sandy" .a "Missing texture"
label values sandy sandy_lbl
label variable sandy "Soil texture = sandy (indicator variable) [INRA 1998]"

/* export to QGIS (identifiers still numeric at this point) */
preserve
	keep depar texture texture_cat sandy sh_sandy
	export delimited using "../../1_raw_data/1_14_soil_texture/soil_texture_arrondissements.csv", replace
restore

/* identifiers as zero-padded strings: 2 characters for dep and ar, 3 for depar */
foreach id in dep ar {
	rename `id' `id'1
	generate `id' = string(`id'1, "%02.0f"), before(`id'1)
	drop `id'1
}
rename depar depar1
generate depar = string(depar1, "%03.0f"), before(depar1)
drop depar1

/* keep only arrondissements present in the 1852 frame */
merge 1:1 depar ar_name using "$TEMP/arrondissements_1852",                  ///
	keepusing(depar ar_name) keep(3) nogenerate

drop dep ar
save "$TEMP/texture_arrond", replace

* TERRAIN RUGGEDNESS
* Builds an area-weighted mean ruggedness per arrondissement from the
* ruggedness polygons intersected with arrondissements, exports it for QGIS,
* and saves it with string identifiers for the final merge.

/* import intersected polygons */
import dbase using "../../1_raw_data/1_15_ruggedness/RUGGEDNESS/RUGGEDNESS_ARRONDISSEMENTS_INTERSECTED.dbf", clear case(lower)

/* weights: share of each polygon in its arrondissement */
by depar, sort: egen area_depar = total(area)
generate area_sh = area / area_depar

/* sanity check: shares add up to one within each arrondissement */
preserve
collapse (sum) area_sh, by(dep-ar_name)
assert area_sh > 0.9999
restore

/* calculate weighted averages */
generate mean_rugg = rugg * area_sh
collapse (sum) mean_rugg, by(dep-ar_name area_depar)
rename mean_rugg ruggedness
* Drop the arrondissement area by its full name. The previous "drop area" only
* worked through variable-name abbreviation (no variable called exactly "area"
* is expected to survive the collapse) and would fail under "set varabbrev off".
drop area_depar

/* export to QGIS (identifiers still numeric at this point) */
preserve
keep depar ruggedness
export delimited using "../../1_raw_data/1_15_ruggedness/ruggedness_arrondissements.csv", replace
restore

/* format identifiers as zero-padded strings */
rename dep dep1
generate dep = string(dep1,"%02.0f"), before(dep1)
drop dep1
rename ar ar1
generate ar = string(ar1,"%02.0f"), before(ar1)
drop ar1
rename depar depar1
generate depar = string(depar1,"%03.0f"), before(depar1)
drop depar1

drop dep ar
save "$TEMP/ruggedness_arrond", replace

* MERGE SOIL TEXTURE AND FARM SIZE
* Combines farm size with soil texture and ruggedness, exports the farm-size
* measures for QGIS, builds the mountain indicator, and saves the final
* analysis dataset restricted to complete observations.

use "$TEMP/farm_size_arrond", clear
* keep(3) retains matched arrondissements only; assert(1 3) stops the do-file
* if the using file holds an arrondissement absent from the farm-size data
merge 1:1 dep_name depar ar_name using "$TEMP/texture_arrond",               ///
	assert(1 3) keep(3) nogenerate
merge 1:1 dep_name depar ar_name using "$TEMP/ruggedness_arrond",            ///
	assert(1 3) keep(3) nogenerate
erase "$TEMP/texture_arrond.dta"
erase "$TEMP/ruggedness_arrond.dta"

/* export to QGIS (numeric identifier, all merged arrondissements) */
preserve
destring depar, replace
keep depar av_farm_size_1 av_farm_size_2
export delimited using "../../1_raw_data/1_21_agricensus/av_farm_size.csv", replace
restore

/* mountain indicator: ruggedness above 3 */
* Stata treats missing values as larger than any number, so an unguarded
* "ruggedness > 3" would flag missing ruggedness as mountainous; the indicator
* is therefore left missing in that case. Such rows are dropped just below, so
* the saved dataset is unaffected.
generate mountain = (ruggedness > 3) if !missing(ruggedness)

/* restrict to arrondissements with all three key variables observed */
keep if !missing(ruggedness)
keep if !missing(av_farm_size_1)
keep if !missing(sh_sandy)

/* label variables */
label variable dep_name  "Département name [Gay 2021]"
label variable depar  "Département-arrondissement identifier [Gay 2021]"
label variable ar_name  "Arrondissement name [Gay 2021]"
label variable mountain  "Mountainous indicator [Nunn Puga 2012]"
label variable coast  "Coastal arrondissement [GEOFLA 2011]"
label variable river  "Presence of a river in arrondissement [SANDRE 2017]"
label variable sh_sandy  "Share of sandy soils [INRA 1998]"
label variable ruggedness  "Ruggedness (100 meters) [Nunn Puga 2012]"


/* save dataset */
compress
save "../../3_outputs/3_1_datasets/agricensus.dta", replace
* ==============================================================================

* Stop timer 1 (started near the top of the file) and print the elapsed time;
* the inline note records the authors' reported run time.
timer off 1 /* 10 seconds */
timer list	